import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
import scipy.stats as ss
#import warnings
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)
import funciones_auxiliares as f_aux
# Load the initially-preprocessed fraud dataset. The CSV was saved with the
# default pandas index, which comes back as an "Unnamed: 0" column — drop it.
df_fraud = pd.read_csv("../data/df_data_initial_preprocessing.csv").drop("Unnamed: 0", axis = 1)
df_fraud.shape
(1000000, 32)
df_fraud
| intended_balcon_amount | prev_address_months_count | bank_months_count | current_address_months_count | session_length_in_minutes | device_distinct_emails_8w | fraud_bool | foreign_request | phone_mobile_valid | has_other_cards | proposed_credit_limit | device_os | source | housing_status | keep_alive_session | device_fraud_count | phone_home_valid | credit_risk_score | email_is_free | income | employment_status | date_of_birth_distinct_emails_4w | bank_branch_count_8w | velocity_4w | velocity_24h | velocity_6h | zip_count_4w | payment_type | days_since_request | customer_age | name_email_similarity | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | 24.0 | 88.0 | 3.888115 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 500.0 | windows | INTERNET | BA | 0.0 | 0.0 | 1.0 | 185.0 | 0.0 | 0.9 | CA | 6.0 | 1.0 | 3863.647740 | 3134.319630 | 10650.765523 | 769.0 | AA | 0.020925 | 50.0 | 0.166828 | 7.0 |
| 1 | NaN | NaN | 15.0 | 144.0 | 31.798819 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 1500.0 | windows | INTERNET | BA | 0.0 | 0.0 | 0.0 | 259.0 | 1.0 | 0.9 | CA | 3.0 | 718.0 | 3124.298166 | 2670.918292 | 534.047319 | 366.0 | AB | 0.005418 | 50.0 | 0.296286 | 7.0 |
| 2 | NaN | NaN | NaN | 132.0 | 4.728705 | 1.0 | 1 | 0.0 | 1.0 | 0.0 | 200.0 | other | INTERNET | BA | 0.0 | 0.0 | 0.0 | 177.0 | 1.0 | 0.9 | CB | 14.0 | 1.0 | 3159.590679 | 2893.621498 | 4048.534263 | 870.0 | AC | 3.108549 | 40.0 | 0.044985 | 7.0 |
| 3 | NaN | NaN | 31.0 | 22.0 | 2.047904 | 1.0 | 1 | 0.0 | 1.0 | 1.0 | 200.0 | linux | INTERNET | BA | 0.0 | 0.0 | 0.0 | 110.0 | 1.0 | 0.9 | CA | 6.0 | 1921.0 | 3022.261812 | 4054.908412 | 3457.064063 | 810.0 | AB | 0.019079 | 50.0 | 0.159511 | 7.0 |
| 4 | NaN | NaN | 31.0 | 218.0 | 3.775225 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 1500.0 | macintosh | INTERNET | BA | 1.0 | 0.0 | 1.0 | 295.0 | 1.0 | 0.9 | CA | 2.0 | 1990.0 | 3087.670952 | 2728.237159 | 5020.341679 | 890.0 | AB | 0.004441 | 50.0 | 0.596414 | 7.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 999995 | NaN | NaN | 25.0 | 104.0 | 8.511502 | 1.0 | 0 | 0.0 | 1.0 | 0.0 | 200.0 | linux | INTERNET | BC | 1.0 | 0.0 | 1.0 | 75.0 | 1.0 | 0.6 | CA | 8.0 | 1.0 | 4972.635997 | 8341.468557 | 7905.711839 | 804.0 | AB | 0.030592 | 40.0 | 0.192631 | 4.0 |
| 999996 | NaN | 148.0 | NaN | 9.0 | 8.967865 | 1.0 | 0 | 0.0 | 1.0 | 0.0 | 200.0 | windows | INTERNET | BC | 0.0 | 0.0 | 1.0 | 154.0 | 1.0 | 0.8 | CC | 2.0 | 0.0 | 5022.728108 | 4955.170808 | 5391.470463 | 3306.0 | AC | 1.628119 | 50.0 | 0.322989 | 4.0 |
| 999997 | 34.692760 | NaN | 11.0 | 30.0 | 8.195531 | 1.0 | 0 | 0.0 | 1.0 | 0.0 | 200.0 | other | INTERNET | BC | 0.0 | 0.0 | 0.0 | 64.0 | 0.0 | 0.8 | CF | 6.0 | 2023.0 | 4377.196321 | 5670.654316 | 8063.102636 | 1522.0 | AA | 0.018563 | 20.0 | 0.879403 | 4.0 |
| 999998 | 94.661055 | NaN | 28.0 | 189.0 | 4.336064 | 1.0 | 0 | 0.0 | 0.0 | 0.0 | 500.0 | windows | INTERNET | BA | 1.0 | 0.0 | 1.0 | 163.0 | 0.0 | 0.9 | CA | 6.0 | 1678.0 | 4394.803296 | 3982.582204 | 8092.641762 | 1418.0 | AA | 0.015352 | 20.0 | 0.762112 | 4.0 |
| 999999 | 9.908499 | NaN | 15.0 | 321.0 | 6.717022 | 1.0 | 0 | 0.0 | 1.0 | 0.0 | 200.0 | linux | INTERNET | BE | 0.0 | 0.0 | 0.0 | 36.0 | 1.0 | 0.2 | CA | 12.0 | 2.0 | 4352.334543 | 3695.308261 | 6169.630036 | 951.0 | AA | 2.655916 | 20.0 | 0.697452 | 4.0 |
1000000 rows × 32 columns
df_fraud.head()
| intended_balcon_amount | prev_address_months_count | bank_months_count | current_address_months_count | session_length_in_minutes | device_distinct_emails_8w | fraud_bool | foreign_request | phone_mobile_valid | has_other_cards | proposed_credit_limit | device_os | source | housing_status | keep_alive_session | device_fraud_count | phone_home_valid | credit_risk_score | email_is_free | income | employment_status | date_of_birth_distinct_emails_4w | bank_branch_count_8w | velocity_4w | velocity_24h | velocity_6h | zip_count_4w | payment_type | days_since_request | customer_age | name_email_similarity | month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | 24.0 | 88.0 | 3.888115 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 500.0 | windows | INTERNET | BA | 0.0 | 0.0 | 1.0 | 185.0 | 0.0 | 0.9 | CA | 6.0 | 1.0 | 3863.647740 | 3134.319630 | 10650.765523 | 769.0 | AA | 0.020925 | 50.0 | 0.166828 | 7.0 |
| 1 | NaN | NaN | 15.0 | 144.0 | 31.798819 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 1500.0 | windows | INTERNET | BA | 0.0 | 0.0 | 0.0 | 259.0 | 1.0 | 0.9 | CA | 3.0 | 718.0 | 3124.298166 | 2670.918292 | 534.047319 | 366.0 | AB | 0.005418 | 50.0 | 0.296286 | 7.0 |
| 2 | NaN | NaN | NaN | 132.0 | 4.728705 | 1.0 | 1 | 0.0 | 1.0 | 0.0 | 200.0 | other | INTERNET | BA | 0.0 | 0.0 | 0.0 | 177.0 | 1.0 | 0.9 | CB | 14.0 | 1.0 | 3159.590679 | 2893.621498 | 4048.534263 | 870.0 | AC | 3.108549 | 40.0 | 0.044985 | 7.0 |
| 3 | NaN | NaN | 31.0 | 22.0 | 2.047904 | 1.0 | 1 | 0.0 | 1.0 | 1.0 | 200.0 | linux | INTERNET | BA | 0.0 | 0.0 | 0.0 | 110.0 | 1.0 | 0.9 | CA | 6.0 | 1921.0 | 3022.261812 | 4054.908412 | 3457.064063 | 810.0 | AB | 0.019079 | 50.0 | 0.159511 | 7.0 |
| 4 | NaN | NaN | 31.0 | 218.0 | 3.775225 | 1.0 | 1 | 0.0 | 0.0 | 0.0 | 1500.0 | macintosh | INTERNET | BA | 1.0 | 0.0 | 1.0 | 295.0 | 1.0 | 0.9 | CA | 2.0 | 1990.0 | 3087.670952 | 2728.237159 | 5020.341679 | 890.0 | AB | 0.004441 | 50.0 | 0.596414 | 7.0 |
df_fraud.columns
Index(['intended_balcon_amount', 'prev_address_months_count',
'bank_months_count', 'current_address_months_count',
'session_length_in_minutes', 'device_distinct_emails_8w', 'fraud_bool',
'foreign_request', 'phone_mobile_valid', 'has_other_cards',
'proposed_credit_limit', 'device_os', 'source', 'housing_status',
'keep_alive_session', 'device_fraud_count', 'phone_home_valid',
'credit_risk_score', 'email_is_free', 'income', 'employment_status',
'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w',
'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w',
'payment_type', 'days_since_request', 'customer_age',
'name_email_similarity', 'month'],
dtype='object')
# Split the columns into categorical vs. the rest using the project helper,
# then give the categorical columns a proper 'category' dtype.
categoricals_list, other = f_aux.dame_variables_categoricas(dataset = df_fraud)
df_fraud[categoricals_list] = df_fraud[categoricals_list].astype("category")
# Continuous variables are everything pandas already parsed as float.
# (The original additionally re-cast these columns with .astype(float),
# which is a no-op on columns selected by select_dtypes("float") — removed.)
continuous_list = list(df_fraud.select_dtypes("float").columns)
df_fraud.dtypes  # check the resulting dtype of every column
intended_balcon_amount float64 prev_address_months_count float64 bank_months_count float64 current_address_months_count float64 session_length_in_minutes float64 device_distinct_emails_8w float64 fraud_bool category foreign_request float64 phone_mobile_valid float64 has_other_cards float64 proposed_credit_limit float64 device_os category source category housing_status category keep_alive_session float64 device_fraud_count float64 phone_home_valid float64 credit_risk_score float64 email_is_free float64 income float64 employment_status category date_of_birth_distinct_emails_4w float64 bank_branch_count_8w float64 velocity_4w float64 velocity_24h float64 velocity_6h float64 zip_count_4w float64 payment_type category days_since_request float64 customer_age float64 name_email_similarity float64 month float64 dtype: object
# Percentage distribution of the target variable.
df_fraud_bool = (
    df_fraud["fraud_bool"]
    .value_counts(normalize = True)
    .mul(100)
    .rename("percent")
    .reset_index()
)
# Absolute counts of the target variable.
df_fraud_bool_conteo = df_fraud["fraud_bool"].value_counts().reset_index()
# Join percentages and counts on the target value column.
df_fraud_bool_pc = (
    df_fraud_bool
    .merge(df_fraud_bool_conteo, on = "fraud_bool", how = 'inner')
    .rename(columns = {"fraud_bool": "index"})
)
df_fraud_bool_pc
fig = px.histogram(df_fraud_bool_pc, x = "index", y = ['percent'])
fig.update_xaxes(tickvals = [0, 1])
fig.show()
from sklearn.model_selection import train_test_split

# Stratified 80/20 split so train and test keep the same fraud rate.
# random_state is fixed so the split — and every number reported from it
# below — is reproducible across notebook runs (the original split changed
# on every execution).
x_df_fraud, x_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(
    df_fraud.drop("fraud_bool", axis = 1),
    df_fraud["fraud_bool"],
    stratify = df_fraud["fraud_bool"],
    test_size = 0.2,
    random_state = 42)
df_fraud_train = pd.concat([x_df_fraud, y_df_fraud], axis = 1)
df_fraud_test = pd.concat([x_df_fraud_test, y_df_fraud_test], axis = 1)
df_fraud_test
| intended_balcon_amount | prev_address_months_count | bank_months_count | current_address_months_count | session_length_in_minutes | device_distinct_emails_8w | foreign_request | phone_mobile_valid | has_other_cards | proposed_credit_limit | device_os | source | housing_status | keep_alive_session | device_fraud_count | phone_home_valid | credit_risk_score | email_is_free | income | employment_status | date_of_birth_distinct_emails_4w | bank_branch_count_8w | velocity_4w | velocity_24h | velocity_6h | zip_count_4w | payment_type | days_since_request | customer_age | name_email_similarity | month | fraud_bool | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 164893 | NaN | NaN | NaN | 34.0 | 6.284015 | 1.0 | 0.0 | 1.0 | 0.0 | 500.0 | other | INTERNET | BA | 0.0 | 0.0 | 0.0 | 158.0 | 0.0 | 0.1 | CA | 8.0 | 0.0 | 4627.661949 | 5892.208076 | 6300.352459 | 1066.0 | AC | 14.278374 | 30.0 | 0.872882 | 3.0 | 0 |
| 259912 | NaN | NaN | 19.0 | 197.0 | 3.463159 | 1.0 | 0.0 | 1.0 | 0.0 | 200.0 | macintosh | INTERNET | BB | 1.0 | 0.0 | 0.0 | 53.0 | 0.0 | 0.1 | CA | 13.0 | 12.0 | 5136.370224 | 5662.654955 | 8540.003798 | 829.0 | AD | 0.038717 | 20.0 | 0.425354 | 2.0 | 0 |
| 158133 | 26.750203 | NaN | 31.0 | 184.0 | 3.512410 | 1.0 | 0.0 | 1.0 | 1.0 | 1500.0 | other | INTERNET | BA | 0.0 | 0.0 | 0.0 | 185.0 | 0.0 | 0.8 | CA | 12.0 | 354.0 | 5032.692675 | 4076.097781 | 4712.130828 | 1966.0 | AA | 0.009290 | 30.0 | 0.127620 | 3.0 | 0 |
| 974829 | NaN | 41.0 | 1.0 | NaN | 2.965838 | 1.0 | 0.0 | 1.0 | 0.0 | 200.0 | other | INTERNET | BC | 1.0 | 0.0 | 0.0 | 144.0 | 0.0 | 0.8 | CA | 8.0 | 8.0 | 3107.382105 | 2828.137810 | 3993.961844 | 1422.0 | AD | 0.015090 | 30.0 | 0.098489 | 4.0 | 0 |
| 900372 | NaN | NaN | 1.0 | 11.0 | 4.969989 | 1.0 | 0.0 | 0.0 | 1.0 | 200.0 | linux | INTERNET | BC | 0.0 | 0.0 | 0.0 | 130.0 | 1.0 | 0.5 | CA | 14.0 | 6.0 | 4759.997038 | 5400.575658 | 10078.408844 | 2077.0 | AB | 0.006393 | 30.0 | 0.851772 | 4.0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 798488 | NaN | NaN | 25.0 | 30.0 | 2.140956 | 1.0 | 0.0 | 1.0 | 0.0 | 200.0 | linux | INTERNET | BB | 0.0 | 0.0 | 0.0 | 70.0 | 1.0 | 0.6 | CA | 19.0 | 14.0 | 4411.063001 | 7368.651690 | 4045.938748 | 1540.0 | AB | 0.001100 | 30.0 | 0.655980 | 5.0 | 0 |
| 216923 | NaN | NaN | 15.0 | 119.0 | 1.372113 | 1.0 | 0.0 | 1.0 | 0.0 | 200.0 | other | INTERNET | BA | 0.0 | 0.0 | 1.0 | 172.0 | 0.0 | 0.8 | CA | 4.0 | 13.0 | 4850.387098 | 5556.945586 | 3468.782135 | 1490.0 | AB | 0.010512 | 40.0 | 0.087478 | 3.0 | 0 |
| 38651 | NaN | 28.0 | 28.0 | 6.0 | 7.358852 | 1.0 | 0.0 | 0.0 | 0.0 | 990.0 | windows | INTERNET | BB | 1.0 | 0.0 | 1.0 | 117.0 | 1.0 | 0.1 | CA | 10.0 | 15.0 | 3046.269860 | 2193.108049 | 1639.216399 | 630.0 | AD | 0.000322 | 30.0 | 0.708867 | 7.0 | 0 |
| 855203 | 52.649272 | NaN | 1.0 | 178.0 | 6.331757 | 1.0 | 0.0 | 1.0 | 1.0 | 1500.0 | other | INTERNET | BE | 0.0 | 0.0 | 0.0 | 228.0 | 1.0 | 0.9 | CA | 2.0 | 19.0 | 4169.087757 | 4579.886623 | 4325.898342 | 735.0 | AA | 0.016136 | 30.0 | 0.788005 | 5.0 | 0 |
| 677134 | NaN | 11.0 | 26.0 | 8.0 | 3.204506 | 2.0 | 0.0 | 1.0 | 0.0 | 200.0 | linux | INTERNET | BC | 0.0 | 0.0 | 1.0 | 89.0 | 1.0 | 0.3 | CA | 14.0 | 4.0 | 5986.894001 | 5613.811715 | 4928.563151 | 1441.0 | AB | 0.010249 | 30.0 | 0.546968 | 0.0 | 0 |
200000 rows × 32 columns
# Confirm stratification: both splits must show the same fraud proportion.
train_dist = df_fraud_train["fraud_bool"].value_counts(normalize = True)
test_dist = df_fraud_test["fraud_bool"].value_counts(normalize = True)
print("", '== Train\n', train_dist)
print("\n", '== Test\n', test_dist)
== Train fraud_bool 0 0.988971 1 0.011029 Name: proportion, dtype: float64 == Test fraud_bool 0 0.98897 1 0.01103 Name: proportion, dtype: float64
# Null diagnostics computed on the TRAINING split.
fraud_series_null_columns = df_fraud_train.isnull().sum().sort_values(ascending = False)
fraud_series_null_rows = df_fraud_train.isnull().sum(axis = 1).sort_values(ascending = False)
df_null_columns = pd.DataFrame(fraud_series_null_columns, columns = ["null_columns"])
df_null_rows = pd.DataFrame(fraud_series_null_rows, columns = ["null_rows"])
# Take the target from the same (train) frame the null counts came from.
df_null_rows["target"] = df_fraud_train["fraud_bool"]
# BUG FIX: percentages must be relative to the TRAIN shape. The original
# divided train null counts (800k rows) by df_fraud.shape[0] (1M rows),
# understating every column percentage by a factor of 0.8 — e.g.
# intended_balcon_amount is ~74% null in train, not ~59%.
df_null_columns["column_percentage"] = df_null_columns["null_columns"] / df_fraud_train.shape[0]
df_null_rows["row_percentage"] = df_null_rows["null_rows"] / df_fraud_train.shape[1]
df_null_columns.head(6)
| null_columns | column_percentage | |
|---|---|---|
| intended_balcon_amount | 593888 | 0.593888 |
| prev_address_months_count | 570093 | 0.570093 |
| bank_months_count | 202769 | 0.202769 |
| current_address_months_count | 3427 | 0.003427 |
| session_length_in_minutes | 1582 | 0.001582 |
| device_distinct_emails_8w | 289 | 0.000289 |
intended_balcon_amount: Cerca del 59.39% de los valores en esta columna son nulos.
prev_address_months_count: Alrededor del 57.01% de los valores en esta columna son nulos.
bank_months_count: Aproximadamente el 20.28% de los valores en esta columna son nulos.
current_address_months_count: Un 0.34% de los valores en esta columna son nulos.
session_length_in_minutes: Un 0.16% de los valores en esta columna son nulos.
device_distinct_emails_8w: Solo alrededor del 0.03% de los valores en esta columna son nulos.
len(df_fraud_train.columns)
32
len(df_fraud_train.columns[:16])
16
len(df_fraud_train.columns[16:])
16
# Plot every feature against the target: float columns are treated as
# continuous, everything else as categorical; the target itself is skipped.
# (Originally this was two near-identical loops over columns[:16] and
# columns[16:] joined with a non-short-circuit `&` on booleans — a single
# pass over all columns in order is equivalent and removes the duplication.)
for col in df_fraud_train.columns:
    if col == "fraud_bool":
        continue
    is_continuous = df_fraud_train[col].dtype == float
    f_aux.plot_feature(df_fraud_train, col_name = col, isContinuous = is_continuous, target = "fraud_bool")
continuous_list
['intended_balcon_amount', 'prev_address_months_count', 'bank_months_count', 'current_address_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w', 'foreign_request', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit', 'keep_alive_session', 'device_fraud_count', 'phone_home_valid', 'credit_risk_score', 'email_is_free', 'income', 'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w', 'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w', 'days_since_request', 'customer_age', 'name_email_similarity', 'month']
f_aux.get_deviation_of_mean_perc(df_fraud_train, continuous_list, target = "fraud_bool", multiplier = 3)
| no_fraud | fraud | variable | sum_outlier_values | porcentaje_sum_outlier_values | |
|---|---|---|---|---|---|
| 0 | 0.989024 | 0.010976 | intended_balcon_amount | 1640 | 0.002050 |
| 1 | 0.993193 | 0.006807 | prev_address_months_count | 6905 | 0.008631 |
| 2 | 0.984446 | 0.015554 | current_address_months_count | 16845 | 0.021056 |
| 3 | 0.980334 | 0.019666 | session_length_in_minutes | 18814 | 0.023518 |
| 4 | 0.961992 | 0.038008 | device_distinct_emails_8w | 25179 | 0.031474 |
| 5 | 0.977977 | 0.022023 | foreign_request | 20206 | 0.025257 |
| 6 | 0.867963 | 0.132037 | proposed_credit_limit | 4938 | 0.006173 |
| 7 | 0.967928 | 0.032072 | credit_risk_score | 2775 | 0.003469 |
| 8 | 0.994164 | 0.005836 | date_of_birth_distinct_emails_4w | 4969 | 0.006211 |
| 9 | 0.989432 | 0.010568 | bank_branch_count_8w | 32741 | 0.040926 |
| 10 | 0.995327 | 0.004673 | velocity_24h | 428 | 0.000535 |
| 11 | 0.992505 | 0.007495 | velocity_6h | 3469 | 0.004336 |
| 12 | 0.990815 | 0.009185 | zip_count_4w | 12956 | 0.016195 |
| 13 | 0.987651 | 0.012349 | days_since_request | 14171 | 0.017714 |
| 14 | 0.957541 | 0.042459 | customer_age | 6312 | 0.007890 |
En la tabla de arriba podemos ver todos los outliers de todas las columnas continuas del conjunto de datos. El dataset en su totalidad tiene un porcentaje de fraude de solo un 1.1% y un 98.9% de no fraude. Teniendo esto en cuenta, tiene sentido que la mayoría de los outliers tengan un porcentaje de fraude y no fraude parecido al de 1.1% y 98.9% respectivamente, pero podemos ver que esto no es lo que se observa en la variable de "proposed_credit_limit", donde las cuentas bancarias que han cometido fraude son el 13% de los outliers, por lo que habría que revisar los outliers de esta variable más detenidamente ya que se excede de lo normal.
Además, las siguientes variables son las que tienen un porcentaje de outliers más altos en el conjunto de datos:
bank_branch_count_8w: Cerca del 4.1% de los valores son considerados atípicos.
current_address_months_count: Alrededor del 2.14% de los valores son considerados atípicos.
session_length_in_minutes: Aproximadamente el 2.35% de los valores son considerados atípicos.
device_distinct_emails_8w: Un 3.16% de los valores son considerados atípicos.
proposed_credit_limit: Cerca del 0.61% de los valores son considerados atípicos.
credit_risk_score: Alrededor del 0.35% de los valores son considerados atípicos. Lo mismo con los demás.
Se puede observar que, en las variables de "device_distinct_emails_8w", "credit_risk_score", y especialmente "customer_age", los outliers tienen un porcentaje notablemente más alto de haber cometido fraude (3.8%, 3.2% y 4.2%, frente a una tasa base del 1.1%) comparado con los outliers del resto de las variables continuas.
A continuación, correlaciones entre las diferentes variables:
# Correlaciones entre las diferentes variables
f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8,6])
0
# 'device_fraud_count' appears constant (0.0 in every displayed row), so it
# has zero variance and produces an undefined Pearson correlation — drop it
# from the continuous set and redraw the matrix.
continuous_list.remove('device_fraud_count')
f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8,6])
0
level_0: Variable en el primer nivel de la correlación.
level_1: Variable en el segundo nivel de la correlación.
Correlation: Valor de correlación entre las dos variables indicadas en level_0 y level_1.
Las correlaciones indican cómo dos variables están relacionadas entre sí. Aquí hay algunas observaciones sobre las correlaciones presentadas:
Comprobando qué variables tienen una correlación entre sí más alta de 0.4:
Las variables de month y velocity_4w tienen una fuerte correlación positiva de 0.85. Además, existe una correlación positiva moderada (0.61) entre credit_risk_score y proposed_credit_limit.
# Pairwise absolute Pearson correlations, keeping only the strictly-lower
# triangle so each variable pair appears exactly once.
corr = df_fraud_train[continuous_list].corr('pearson')
lower_mask = np.tril(np.ones(corr.shape, dtype=bool), k = -1)
new_corr = corr.abs().where(lower_mask, 0.0)
# Flatten to (level_0, level_1, correlation) rows, strongest pairs first.
new_corr = (
    new_corr.stack()
    .to_frame('correlation')
    .reset_index()
    .sort_values(by='correlation', ascending=False)
)
new_corr[new_corr['correlation'] > 0.4]
| level_0 | level_1 | correlation | |
|---|---|---|---|
| 617 | month | velocity_4w | 0.847791 |
| 309 | credit_risk_score | proposed_credit_limit | 0.607147 |
| 618 | month | velocity_24h | 0.549711 |
| 467 | velocity_24h | velocity_4w | 0.538393 |
| 493 | velocity_6h | velocity_24h | 0.463799 |
| 565 | customer_age | date_of_birth_distinct_emails_4w | 0.419411 |
| 619 | month | velocity_6h | 0.409437 |
| 492 | velocity_6h | velocity_4w | 0.400548 |
¿Son todos los nulos de una clase de la variable objetivo? o tienen el mismo porcentaje de la variable objetivo?
df_fraud_train["intended_balcon_amount"].sort_index(ascending = True)
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
999994 NaN
999995 NaN
999996 NaN
999997 34.692760
999999 9.908499
Name: intended_balcon_amount, Length: 800000, dtype: float64
continuous_list
['intended_balcon_amount', 'prev_address_months_count', 'bank_months_count', 'current_address_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w', 'foreign_request', 'phone_mobile_valid', 'has_other_cards', 'proposed_credit_limit', 'keep_alive_session', 'phone_home_valid', 'credit_risk_score', 'email_is_free', 'income', 'date_of_birth_distinct_emails_4w', 'bank_branch_count_8w', 'velocity_4w', 'velocity_24h', 'velocity_6h', 'zip_count_4w', 'days_since_request', 'customer_age', 'name_email_similarity', 'month']
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
| no_fraud | fraud | variable | sum_null_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.986885 | 0.013115 | intended_balcon_amount | 593888 | 0.742360 |
| 1 | 0.985771 | 0.014229 | prev_address_months_count | 570093 | 0.712616 |
| 2 | 0.983740 | 0.016260 | bank_months_count | 202769 | 0.253461 |
| 3 | 0.996207 | 0.003793 | current_address_months_count | 3427 | 0.004284 |
| 4 | 0.993047 | 0.006953 | session_length_in_minutes | 1582 | 0.001978 |
| 5 | 0.989619 | 0.010381 | device_distinct_emails_8w | 289 | 0.000361 |
Imputando los valores nulos de las variables "device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count" y "bank_months_count" por sus medias, dado que el porcentaje de nulos a lo largo del conjunto de datos es bajo en las cuatro variables:
# Mean-impute the four low-missingness columns on the FULL dataset.
# NOTE(review): these means are computed before the later re-split, so test
# rows influence the imputation values used for train (mild train/test
# leakage) — ideally fit the imputation on the training split only.
df_fraud[["device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count",
"bank_months_count"]] = df_fraud[["device_distinct_emails_8w", "session_length_in_minutes",
"current_address_months_count", "bank_months_count"]].apply(lambda x: x.fillna(x.mean()))
f_aux.get_percent_null_values_target(df_fraud, continuous_list, target = "fraud_bool")
| no_fraud | fraud | variable | sum_null_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.986873 | 0.013127 | intended_balcon_amount | 742523 | 0.742523 |
| 1 | 0.985785 | 0.014215 | prev_address_months_count | 712920 | 0.712920 |
| 2 | NaN | NaN | bank_months_count | 0 | 0.000000 |
| 3 | NaN | NaN | current_address_months_count | 0 | 0.000000 |
| 4 | NaN | NaN | session_length_in_minutes | 0 | 0.000000 |
| 5 | NaN | NaN | device_distinct_emails_8w | 0 | 0.000000 |
Dado que las otras dos variables con nulos ("intended_balcon_amount" y "prev_address_months_count") tienen un porcentaje de elementos nulos muy alto, usamos el modelo de regresión de KNNImputer para imputar sus valores missing, usando como regresoras todas las variables continuas:
# Repeat the same mean imputation on the training split, then confirm that
# the four low-missingness columns no longer contain nulls.
df_fraud_train[["device_distinct_emails_8w", "session_length_in_minutes", "current_address_months_count",
"bank_months_count"]] = df_fraud_train[["device_distinct_emails_8w", "session_length_in_minutes",
"current_address_months_count", "bank_months_count"]].apply(lambda x: x.fillna(x.mean()))
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
| no_fraud | fraud | variable | sum_null_values | porcentaje_sum_null_values | |
|---|---|---|---|---|---|
| 0 | 0.986885 | 0.013115 | intended_balcon_amount | 593888 | 0.742360 |
| 1 | 0.985771 | 0.014229 | prev_address_months_count | 570093 | 0.712616 |
| 2 | NaN | NaN | bank_months_count | 0 | 0.000000 |
| 3 | NaN | NaN | current_address_months_count | 0 | 0.000000 |
| 4 | NaN | NaN | session_length_in_minutes | 0 | 0.000000 |
| 5 | NaN | NaN | device_distinct_emails_8w | 0 | 0.000000 |
Idealmente, en el tratamiento de datos nulos, usaríamos el código de debajo para imputar los valores nulos de las dos columnas que tienen un porcentaje de nulos de más del 70% con el modelo de regresión KNNImputer, usando como regresoras todas las variables continuas. Esto sería porque los valores que imputaría la regresión KNN en vez de los nulos ayudarían a crear modelos mucho más precisos que si simplemente imputáramos la media de cada columna donde se encuentren los nulos. Sin embargo, como la celda de código de debajo tiene un tiempo de ejecución extremadamente largo, esto segundo es justo lo que vamos a hacer para poder crear predicciones (aunque vayan a ser menos precisas de lo que sería teóricamente posible).
#X_train = df_fraud_train[continuous_list]
#X_test = df_fraud_test_2[continuous_list]
#imputer = KNNImputer(n_neighbors = 2, weights = "uniform")
#model = imputer.fit(X_train)
#df_input_train = pd.DataFrame(model.transform(X_train),
# columns = [i+'_input' for i in list(set(continuous_list))],index=df_fraud_train.index)
#df_input_test = pd.DataFrame(model.transform(X_test),
# columns = [i+'_input' for i in list(set(continuous_list))],index=df_fraud_test_2.index)
# Fall back to mean imputation for every remaining null in the continuous
# columns, then rebuild the stratified 80/20 split on the fully-imputed data.
# NOTE(review): the means come from the full dataset before splitting
# (train/test leakage), and no random_state is set so this split is not
# reproducible — confirm whether that is acceptable for this analysis.
df_fraud[continuous_list] = df_fraud[continuous_list].apply(lambda x: x.fillna(x.mean()))
x_df_fraud, x_df_fraud_test, y_df_fraud, y_df_fraud_test = train_test_split(
df_fraud.drop("fraud_bool",axis=1),
df_fraud["fraud_bool"],
stratify = df_fraud["fraud_bool"],
test_size = 0.2)
df_fraud_train = pd.concat([x_df_fraud, y_df_fraud], axis = 1)
df_fraud_test = pd.concat([x_df_fraud_test, y_df_fraud_test], axis = 1)
f_aux.get_corr_matrix(dataset = df_fraud_train[continuous_list], metodo = 'pearson', size_figure = [8, 6])
0
f_aux.get_percent_null_values_target(df_fraud_train, continuous_list, target = "fraud_bool")
No existen variables con valores nulos
Para la correlacion de spearman es necesario convertir las variables categoricas en numericas y luego obtener la correlación
categoricals_list
['fraud_bool', 'device_os', 'source', 'housing_status', 'employment_status', 'payment_type']
def cramers_v(confusion_matrix):
    """Bias-corrected Cramér's V for categorical-categorical association.

    Uses the correction from Bergsma and Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : 2-D array-like
        Contingency table, e.g. ``pd.crosstab(a, b)`` or its ``.values``.
        (Generalized: DataFrames are now accepted directly — the original
        required a NumPy array because ``DataFrame.sum()`` returns a Series.)

    Returns
    -------
    float
        Corrected V in [0, 1]; 0 means no association, 1 perfect association.
    """
    confusion_matrix = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias-corrected phi^2 and effective table dimensions (Bergsma-Wicher).
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    denom = min((kcorr-1), (rcorr-1))
    # Degenerate 1xK / Kx1 tables would divide by zero: no association.
    if denom <= 0:
        return 0.0
    return np.sqrt(phi2corr / denom)
# Strength of association between the target and the device OS
# (bias-corrected Cramér's V on the contingency table).
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["device_os"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
device_os linux macintosh other windows x11 fraud_bool 0 264804 42377 272500 205771 5725 1 1357 600 1572 5227 67
0.0809818636850644
# Sanity check: a variable crossed with itself should yield V close to 1.
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["fraud_bool"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
fraud_bool 0 1 fraud_bool 0 791177 0 1 0 8823
0.9999426978916621
# Association between the target and the application source channel.
confusion_matrix = pd.crosstab(df_fraud_train["fraud_bool"], df_fraud_train["source"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
source INTERNET TELEAPP fraud_bool 0 785574 5603 1 8727 96
0.0045096844811087725
categoricals_list
['fraud_bool', 'device_os', 'source', 'housing_status', 'employment_status', 'payment_type']
df_fraud["payment_type"].unique()
['AA', 'AB', 'AC', 'AD', 'AE'] Categories (5, object): ['AA', 'AB', 'AC', 'AD', 'AE']
# Ordinal maps so Spearman correlation can be computed on the categoricals.
# NOTE(review): the assigned order is arbitrary — these look like nominal
# categories, so the resulting rank correlations should be read with caution.
# Some map keys (e.g. 'BF', 'BG', 'CD', 'AF', 'AG') may cover levels not
# present in this sample; unmapped levels would become NaN — verify.
device_map = {'windows': 1, 'other': 2, 'linux': 3, 'macintosh': 4, "x11": 5}
source_map = {"INTERNET": 1, "TELEAPP": 2}
housing_map = {'BA': 1, 'BB': 2, 'BC': 3, 'BD': 4, "BE": 5, 'BF': 6, 'BG': 7}
employment_map = {'CA': 1, 'CB': 2, 'CC': 3, 'CD': 4, "CE": 5, 'CF': 6, 'CG': 7}
payment_map = {'AA': 1, 'AB': 2, 'AC': 3, 'AD': 4, "AE": 5, 'AF': 6, 'AG': 7}
df_fraud['device_os'] = df_fraud['device_os'].map(device_map).astype("float64")
df_fraud['source'] = df_fraud['source'].map(source_map).astype("float64")
df_fraud['housing_status'] = df_fraud['housing_status'].map(housing_map).astype("float64")
df_fraud['employment_status'] = df_fraud['employment_status'].map(employment_map).astype("float64")
df_fraud['payment_type'] = df_fraud['payment_type'].map(payment_map).astype("float64")
f_aux.get_corr_matrix(dataset = df_fraud[categoricals_list], metodo='spearman', size_figure=[5,4])
0
No hay valores nulos en las variables categoricas de este conjunto de datos, pero si hubiera hariamos lo siguiente:
# Flag any missing categorical values with an explicit "SIN VALOR" level
# (cast to object first because .fillna cannot introduce a value that is not
# already a category), then persist both splits for the next notebook stage.
df_fraud_train[categoricals_list] = df_fraud_train[categoricals_list].astype("object")\
.fillna("SIN VALOR").astype("category")
df_fraud_test[categoricals_list] = df_fraud_test[categoricals_list].astype("object").\
fillna("SIN VALOR").astype("category")
df_fraud_train.to_csv("../data/train_df_data_preprocessing_missing_outlier.csv")
df_fraud_test.to_csv("../data/test_df_data_preprocessing_missing_outlier.csv")